Add test verification workflow and safer generation#3
Conversation
There was a problem hiding this comment.
Pull request overview
This PR extends the AutoCode MCP toolchain with a new test-verification step and broader support for custom-named solutions/generators, while hardening test generation to reduce the risk of unintended file deletion. It also adds stress-test statistics, updates workflow documentation, and bumps the package/plugin version to 0.7.0.
Changes:
- Add
problem_verify_teststool and register it in the MCP server/tooling docs/tests. - Add custom solution naming support across build/run, stress testing, test generation/verification; add generator
extra_argsand stress-test summary statistics. - Harden
problem_generate_testsoutput directory handling and only clear generated.in/.ansfiles; bump version to0.7.0.
Reviewed changes
Copilot reviewed 20 out of 21 changed files in this pull request and generated 4 comments.
Show a summary per file
| File | Description |
|---|---|
src/autocode_mcp/tools/test_verify.py |
New verification tool (file pairing, answer consistency, validator, empty-file checks). |
src/autocode_mcp/server.py |
Registers the new problem_verify_tests tool (tool count increases). |
src/autocode_mcp/tools/problem.py |
Adds output_dir + sol_name, validates output dir, and clears only generated test artifacts. |
src/autocode_mcp/tools/stress_test.py |
Adds sol_name/brute_name, generator types cycling, extra_args, and runtime statistics summary. |
src/autocode_mcp/tools/solution.py |
Adds name support to build/run and uses shared resolve_source. |
src/autocode_mcp/tools/{validator,generator,checker,interactor}.py |
Refactors duplicated source resolution into resolve_source, adds canonical path + binary size reporting. |
src/autocode_mcp/tools/mixins.py |
Introduces ResolvedSource + resolve_source() helper. |
scripts/workflow_guard.py |
Updates inferred state for custom-named solutions (prefix match) and adds workflow text mentioning verification. |
tests/test_tools/test_problem.py |
Adds regression tests for safer output dir handling and custom sol_name. |
tests/test_e2e_mcp.py |
Updates expected tool count for MCP list-tools. |
pyproject.toml, src/autocode_mcp/__init__.py, uv.lock, .claude-plugin/plugin.json, packaging tests |
Version bump to 0.7.0. |
CHANGELOG.md, CLAUDE.md |
Workflow/docs updates reflecting new tool and parameters. |
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
| problem_root = os.path.abspath(problem_dir) | ||
| raw_output_dir = output_dir or "tests" | ||
| tests_dir = raw_output_dir | ||
| if not os.path.isabs(tests_dir): | ||
| tests_dir = os.path.join(problem_root, tests_dir) | ||
| tests_dir = os.path.abspath(tests_dir) | ||
|
|
||
| try: | ||
| common = os.path.commonpath([problem_root, tests_dir]) | ||
| except ValueError: | ||
| common = "" | ||
| if os.path.normcase(common) != os.path.normcase(problem_root): | ||
| return None, ToolResult.fail("output_dir must be inside problem_dir") | ||
|
|
||
| if os.path.normcase(tests_dir) == os.path.normcase(problem_root): | ||
| return None, ToolResult.fail("output_dir cannot be the problem_dir root") | ||
|
|
||
| reserved_dirs = {"files", "solutions", "statements"} | ||
| for reserved in reserved_dirs: | ||
| reserved_path = os.path.join(problem_root, reserved) | ||
| if os.path.normcase(os.path.commonpath([reserved_path, tests_dir])) == os.path.normcase( | ||
| reserved_path | ||
| ): | ||
| return None, ToolResult.fail(f"output_dir cannot be reserved directory: {reserved}") | ||
|
|
There was a problem hiding this comment.
_resolve_tests_dir() uses os.path.abspath/commonpath checks, which can be bypassed via symlinks (e.g., output_dir points to a symlink under problem_dir that resolves outside). That can still cause _clear_generated_tests() to delete .in/.ans files in external locations. Consider validating with os.path.realpath() (or Path.resolve()) for both problem_dir and tests_dir and/or rejecting symlinked output directories before clearing.
| problem_root = os.path.abspath(problem_dir) | |
| raw_output_dir = output_dir or "tests" | |
| tests_dir = raw_output_dir | |
| if not os.path.isabs(tests_dir): | |
| tests_dir = os.path.join(problem_root, tests_dir) | |
| tests_dir = os.path.abspath(tests_dir) | |
| try: | |
| common = os.path.commonpath([problem_root, tests_dir]) | |
| except ValueError: | |
| common = "" | |
| if os.path.normcase(common) != os.path.normcase(problem_root): | |
| return None, ToolResult.fail("output_dir must be inside problem_dir") | |
| if os.path.normcase(tests_dir) == os.path.normcase(problem_root): | |
| return None, ToolResult.fail("output_dir cannot be the problem_dir root") | |
| reserved_dirs = {"files", "solutions", "statements"} | |
| for reserved in reserved_dirs: | |
| reserved_path = os.path.join(problem_root, reserved) | |
| if os.path.normcase(os.path.commonpath([reserved_path, tests_dir])) == os.path.normcase( | |
| reserved_path | |
| ): | |
| return None, ToolResult.fail(f"output_dir cannot be reserved directory: {reserved}") | |
| problem_root = os.path.realpath(problem_dir) | |
| raw_output_dir = output_dir or "tests" | |
| tests_dir = raw_output_dir | |
| if not os.path.isabs(tests_dir): | |
| tests_dir = os.path.join(problem_root, tests_dir) | |
| tests_dir = os.path.abspath(tests_dir) | |
| resolved_tests_dir = os.path.realpath(tests_dir) | |
| try: | |
| common = os.path.commonpath([problem_root, resolved_tests_dir]) | |
| except ValueError: | |
| common = "" | |
| if os.path.normcase(common) != os.path.normcase(problem_root): | |
| return None, ToolResult.fail("output_dir must be inside problem_dir") | |
| if os.path.normcase(resolved_tests_dir) == os.path.normcase(problem_root): | |
| return None, ToolResult.fail("output_dir cannot be the problem_dir root") | |
| reserved_dirs = {"files", "solutions", "statements"} | |
| for reserved in reserved_dirs: | |
| reserved_path = os.path.realpath(os.path.join(problem_root, reserved)) | |
| try: | |
| reserved_common = os.path.commonpath([reserved_path, resolved_tests_dir]) | |
| except ValueError: | |
| reserved_common = "" | |
| if os.path.normcase(reserved_common) == os.path.normcase(reserved_path): | |
| return None, ToolResult.fail(f"output_dir cannot be reserved directory: {reserved}") | |
| if os.path.exists(tests_dir) and os.path.islink(tests_dir): | |
| return None, ToolResult.fail(f"output_dir cannot be a symlink: {tests_dir}") |
| expected_indices = list(range(1, len(numeric_indices) + 1)) | ||
| missing_indices = [ | ||
| idx for idx in expected_indices if idx not in numeric_indices | ||
| ] if numeric_indices else [] | ||
| duplicate_indices = sorted( | ||
| idx for idx in set(numeric_indices) if numeric_indices.count(idx) > 1 |
There was a problem hiding this comment.
_check_file_count() computes expected_indices as range(1, len(numeric_indices)+1), which under-reports gaps when filenames skip ahead (e.g., 01.in and 100.in would only report missing 2). If you want to enforce contiguous numbering, expected_indices should be based on max(numeric_indices) (and possibly also validate the set starts at 1).
| expected_indices = list(range(1, len(numeric_indices) + 1)) | |
| missing_indices = [ | |
| idx for idx in expected_indices if idx not in numeric_indices | |
| ] if numeric_indices else [] | |
| duplicate_indices = sorted( | |
| idx for idx in set(numeric_indices) if numeric_indices.count(idx) > 1 | |
| numeric_index_set = set(numeric_indices) | |
| expected_indices = list(range(1, max(numeric_indices) + 1)) if numeric_indices else [] | |
| missing_indices = [ | |
| idx for idx in expected_indices if idx not in numeric_index_set | |
| ] | |
| duplicate_indices = sorted( | |
| idx for idx in numeric_index_set if numeric_indices.count(idx) > 1 |
| "tests_generated": any((root / "tests").glob("*.in")) if (root / "tests").exists() else False, | ||
| "generated_test_count": len(list((root / "tests").glob("*.in"))) if (root / "tests").exists() else 0, | ||
| "tests_verified": False, | ||
| "packaged": (root / "problem.xml").exists(), | ||
| } |
There was a problem hiding this comment.
workflow_guard introduces state["tests_verified"] and advertises a required problem_verify_tests step, but the state is never updated in post_tool() and pre_tool() still allows problem_pack_polygon after only tests_generated. This makes the new verification step unenforced in the guard. Update post_tool() to set tests_verified based on problem_verify_tests output (e.g., data.get("passed")) and gate problem_pack_polygon on tests_verified.
| # 确定文件名 | ||
| source_name = f"{solution_type}.cpp" | ||
| source_path = os.path.join(solutions_dir, source_name) | ||
| effective_name = name or solution_type | ||
| exe_ext = get_exe_extension() | ||
| canonical_path = os.path.join(solutions_dir, f"{effective_name}.cpp") | ||
| binary_path = os.path.join(solutions_dir, f"{effective_name}{exe_ext}") | ||
|
|
||
| # 保存代码 | ||
| # 保存到标准位置(其他工具依赖此路径) | ||
| try: | ||
| with open(source_path, "w", encoding="utf-8") as f: | ||
| f.write(code) | ||
| with open(canonical_path, "w", encoding="utf-8") as f: | ||
| f.write(resolved.code) |
There was a problem hiding this comment.
The comment says this is saved to a “canonical/standard location that other tools depend on”, but canonical_path is now derived from effective_name. Other parts of the codebase (e.g., Polygon packing XML uses solutions/sol.cpp & solutions/brute.cpp) still assume the standard filenames, so building with name!=solution_type can break downstream workflow. Consider either always also writing/copying to solutions/{solution_type}.cpp for backward compatibility, or updating downstream tools (e.g., problem_pack_polygon) to use the chosen name.
Summary
Verification